Loading required packages
#install.packages("ROCR")
library(ggplot2)
Warning message:
package ‘arules’ was built under R version 3.4.4
library(corrplot)
package ‘corrplot’ was built under R version 3.4.2corrplot 0.84 loaded
library(ROCR)
getwd()
[1] "/Users/sravan/Documents/GitHub/DataAnalytics/DataScience_/Telco Customer"
Reading input file
data.set.orig = read.csv(file = "Datasets/Teleco_Cust_Attr.csv", header = T)
head(data.set.orig)
summary(data.set.orig)
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines
0002-ORFBO: 1 Female:3488 Min. :0.0000 No :3641 No :4933 Min. : 0.00 No : 682 No :3390
0003-MKNFE: 1 Male :3555 1st Qu.:0.0000 Yes:3402 Yes:2110 1st Qu.: 9.00 Yes:6361 No phone service: 682
0004-TLHLJ: 1 Median :0.0000 Median :29.00 Yes :2971
0011-IGKFF: 1 Mean :0.1621 Mean :32.37
0013-EXCHZ: 1 3rd Qu.:0.0000 3rd Qu.:55.00
0013-MHZWF: 1 Max. :1.0000 Max. :72.00
(Other) :7037
InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport
DSL :2421 No :3498 No :3088 No :3095 No :3473
Fiber optic:3096 No internet service:1526 No internet service:1526 No internet service:1526 No internet service:1526
No :1526 Yes :2019 Yes :2429 Yes :2422 Yes :2044
StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod
No :2810 No :2785 Month-to-month:3875 No :2872 Bank transfer (automatic):1544
No internet service:1526 No internet service:1526 One year :1473 Yes:4171 Credit card (automatic) :1522
Yes :2707 Yes :2732 Two year :1695 Electronic check :2365
Mailed check :1612
MonthlyCharges TotalCharges Churn
Min. : 18.25 Min. : 18.8 No :5174
1st Qu.: 35.50 1st Qu.: 401.4 Yes:1869
Median : 70.35 Median :1397.5
Mean : 64.76 Mean :2283.3
3rd Qu.: 89.85 3rd Qu.:3794.7
Max. :118.75 Max. :8684.8
NA's :11
Data Preprocessing
Dropping customerID variable
data.set.orig$customerID = NULL
Checking for Missing Values and imputing
sapply(data.set.orig, function(x) sum(is.na(x)))
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService
0 0 0 0 0 0 0 0
OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling
0 0 0 0 0 0 0 0
PaymentMethod MonthlyCharges TotalCharges Churn
0 0 11 0
Handling NAs in TotalCharges
data.set.orig$TotalCharges = ifelse(is.na(data.set.orig$TotalCharges),
data.set.orig$tenure*data.set.orig$MonthlyCharges,
data.set.orig$TotalCharges)
sapply(data.set.orig, function(x) sum(is.na(x)))
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService
0 0 0 0 0 0 0 0
OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling
0 0 0 0 0 0 0 0
PaymentMethod MonthlyCharges TotalCharges Churn
0 0 0 0
Converting SeniorCitizen to factor
data.set.orig$SeniorCitizen = as.factor(data.set.orig$SeniorCitizen)
EDA
i=1
for(i in 1:ncol(data.set.orig)){
#print(colnames(data.set.orig)[i])
if(is.factor(data.set.orig[,i])){
print(ggplot(data.set.orig,aes_string("Churn",colnames(data.set.orig)[i]))
+geom_jitter(aes(col=Churn)))
}
}
i=1
for(i in 1:ncol(data.set.orig)){
#print(colnames(data.set.orig)[i])
if(is.numeric(data.set.orig[,i])){
print(ggplot(data.set.orig,aes_string(colnames(data.set.orig)[i]))
+geom_density())
}
}
nums = unlist(lapply(data.set.orig, is.numeric))
temp = data.set.orig[,nums]
price.corplot = cor(temp)
corrplot(price.corplot, method="number")
print(ggplot(data.set.orig,aes(Churn,tenure))+geom_boxplot())
Dropping variables tenure and MonthlyCharges
#data.set.orig$tenure = NULL
#data.set.orig$MonthlyCharges = NULL
head(data.set.orig)
Transforming TotalCharges to Normal Distribution
print(ggplot(data.set.orig,aes(TotalCharges))+geom_density())
print(ggplot(data.set.orig,aes(sqrt(TotalCharges)))+geom_density())
print(ggplot(data.set.orig,aes(Churn,TotalCharges))+geom_boxplot())
print(ggplot(data.set.orig,aes(tenure))+geom_density())
print(ggplot(data.set.orig,aes(sin(tenure)))+geom_density())
print(ggplot(data.set.orig,aes(Churn,tenure))+geom_boxplot())
print(ggplot(data.set.orig[data.set.orig$tenure<30,],aes((tenure)))+geom_density())
print(ggplot(data.set.orig[data.set.orig$tenure<30,],aes(sqrt(tenure)))+geom_density())
print(ggplot(data.set.orig,aes(MonthlyCharges))+geom_density())
print(ggplot(data.set.orig,aes(sqrt(MonthlyCharges)))+geom_density())
print(ggplot(data.set.orig,aes(Churn,MonthlyCharges))+geom_boxplot())
Subsetting Required Variables
colnames(data.set.orig)
[1] "gender" "SeniorCitizen" "Partner" "Dependents" "tenure" "PhoneService" "MultipleLines"
[8] "InternetService" "OnlineSecurity" "OnlineBackup" "DeviceProtection" "TechSupport" "StreamingTV" "StreamingMovies"
[15] "Contract" "PaperlessBilling" "PaymentMethod" "MonthlyCharges" "TotalCharges" "Churn"
data.set = data.set.orig[,c("SeniorCitizen","Dependents","PhoneService","MultipleLines","InternetService" , "OnlineSecurity" , "OnlineBackup","DeviceProtection" ,"TechSupport" , "StreamingTV" , "StreamingMovies" , "Contract","PaperlessBilling", "PaymentMethod" , "tenure","Churn")]
Test Train Split
sample = sample(1:nrow(data.set),0.7*nrow(data.set))
train.data.set = data.set[sample,]
test.data.set = data.set[-sample,]
rbind(nrow(data.set),nrow(train.data.set),nrow(test.data.set))
[,1]
[1,] 7043
[2,] 4930
[3,] 2113
Applying Logistic Regression
m1 <- glm (Churn ~ ., data = train.data.set, family = binomial)
summary(m1)
Call:
glm(formula = Churn ~ ., family = binomial, data = train.data.set)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.0212 -0.6802 -0.2898 0.6851 3.1468
Coefficients: (7 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.362406 0.178326 -2.032 0.042127 *
SeniorCitizen1 0.308995 0.100478 3.075 0.002103 **
DependentsYes -0.202522 0.098703 -2.052 0.040187 *
PhoneServiceYes -0.208588 0.153231 -1.361 0.173429
MultipleLinesNo phone service NA NA NA NA
MultipleLinesYes 0.241569 0.093717 2.578 0.009948 **
InternetServiceFiber optic 0.735241 0.107964 6.810 9.76e-12 ***
InternetServiceNo -0.705551 0.158469 -4.452 8.50e-06 ***
OnlineSecurityNo internet service NA NA NA NA
OnlineSecurityYes -0.344624 0.101221 -3.405 0.000662 ***
OnlineBackupNo internet service NA NA NA NA
OnlineBackupYes -0.137814 0.092464 -1.490 0.136104
DeviceProtectionNo internet service NA NA NA NA
DeviceProtectionYes 0.015020 0.094732 0.159 0.874019
TechSupportNo internet service NA NA NA NA
TechSupportYes -0.301941 0.102771 -2.938 0.003304 **
StreamingTVNo internet service NA NA NA NA
StreamingTVYes 0.361575 0.095672 3.779 0.000157 ***
StreamingMoviesNo internet service NA NA NA NA
StreamingMoviesYes 0.296168 0.095555 3.099 0.001939 **
ContractOne year -0.708044 0.127742 -5.543 2.98e-08 ***
ContractTwo year -1.442974 0.213616 -6.755 1.43e-11 ***
PaperlessBillingYes 0.334096 0.088554 3.773 0.000161 ***
PaymentMethodCredit card (automatic) -0.088979 0.135897 -0.655 0.512626
PaymentMethodElectronic check 0.302192 0.112333 2.690 0.007142 **
PaymentMethodMailed check -0.092960 0.133627 -0.696 0.486637
tenure -0.035018 0.002799 -12.509 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5710.5 on 4929 degrees of freedom
Residual deviance: 4110.2 on 4910 degrees of freedom
AIC: 4150.2
Number of Fisher Scoring iterations: 6
p1 = predict(m1, train.data.set, type = 'response')
prediction from a rank-deficient fit may be misleading
pred2 = prediction(p1,train.data.set$Churn)
perf = performance(pred2,"tpr","fpr")
plot(perf,colorize = T)
t1 = table(train.data.set$Churn, p1 > 0.45)
t1
FALSE TRUE
No 3155 464
Yes 543 768
TN1 = t1[1]
FN1 = t1[2]
FP1 = t1[3]
TP1 = t1[4]
TN1
[1] 3155
FN1
[1] 543
FP1
[1] 464
TP1
[1] 768
Accuary1 = (TP1+TN1)/(TP1+TN1+FP1+FN1)
Precision1 = (TP1)/(TP1+FP1)
Recall1 = (TP1)/(TP1+FN1)
F11 = 2*Precision1*Recall1/(Precision1+Recall1)
Accuary1
[1] 0.7957404
Precision1
[1] 0.6233766
Recall1
[1] 0.5858124
F11
[1] 0.604011
p1 = predict(m1, test.data.set, type = 'response')
prediction from a rank-deficient fit may be misleading
pred2 = prediction(p1,test.data.set$Churn)
perf = performance(pred2,"tpr","fpr")
plot(perf,colorize = T)
t1 = table(test.data.set$Churn, p1 > 0.45)
t1
FALSE TRUE
No 1358 197
Yes 205 353
TN1 = t1[1]
FN1 = t1[2]
FP1 = t1[3]
TP1 = t1[4]
TN1
[1] 1358
FN1
[1] 205
FP1
[1] 197
TP1
[1] 353
Accuary1 = (TP1+TN1)/(TP1+TN1+FP1+FN1)
Precision1 = (TP1)/(TP1+FP1)
Recall1 = (TP1)/(TP1+FN1)
F11 = 2*Precision1*Recall1/(Precision1+Recall1)
Accuary1
[1] 0.8097492
Precision1
[1] 0.6418182
Recall1
[1] 0.6326165
F11
[1] 0.6371841
Model 2 Subsetting Required Variables
colnames(data.set.orig)
[1] "gender" "SeniorCitizen" "Partner" "Dependents" "tenure" "PhoneService" "MultipleLines"
[8] "InternetService" "OnlineSecurity" "OnlineBackup" "DeviceProtection" "TechSupport" "StreamingTV" "StreamingMovies"
[15] "Contract" "PaperlessBilling" "PaymentMethod" "MonthlyCharges" "TotalCharges" "Churn"
data.set = data.set.orig[,c("SeniorCitizen","Dependents","Partner","PhoneService","MultipleLines","InternetService" , "OnlineSecurity" , "OnlineBackup","DeviceProtection" , "StreamingTV" , "Contract", "PaymentMethod" , "tenure","Churn")]
Test Train Split
sample = sample(1:nrow(data.set),0.7*nrow(data.set))
train.data.set = data.set[sample,]
test.data.set = data.set[-sample,]
rbind(nrow(data.set),nrow(train.data.set),nrow(test.data.set))
[,1]
[1,] 7043
[2,] 4930
[3,] 2113
Applying Logistic Regression
m1 <- glm (Churn ~ ., data = train.data.set, family = binomial)
summary(m1)
Call:
glm(formula = Churn ~ ., family = binomial, data = train.data.set)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9775 -0.6531 -0.2982 0.6974 3.1397
Coefficients: (5 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.106754 0.165398 -0.645 0.518642
SeniorCitizen1 0.320372 0.101173 3.167 0.001542 **
DependentsYes -0.268335 0.108552 -2.472 0.013437 *
PartnerYes 0.013469 0.091926 0.147 0.883515
PhoneServiceYes -0.558865 0.149902 -3.728 0.000193 ***
MultipleLinesNo phone service NA NA NA NA
MultipleLinesYes 0.340890 0.094422 3.610 0.000306 ***
InternetServiceFiber optic 1.000236 0.108441 9.224 < 2e-16 ***
InternetServiceNo -0.824248 0.160540 -5.134 2.83e-07 ***
OnlineSecurityNo internet service NA NA NA NA
OnlineSecurityYes -0.371946 0.101022 -3.682 0.000232 ***
OnlineBackupNo internet service NA NA NA NA
OnlineBackupYes -0.102816 0.091846 -1.119 0.262952
DeviceProtectionNo internet service NA NA NA NA
DeviceProtectionYes -0.083612 0.093285 -0.896 0.370090
StreamingTVNo internet service NA NA NA NA
StreamingTVYes 0.412817 0.090974 4.538 5.69e-06 ***
ContractOne year -0.591041 0.124561 -4.745 2.09e-06 ***
ContractTwo year -1.316618 0.207697 -6.339 2.31e-10 ***
PaymentMethodCredit card (automatic) -0.072965 0.137887 -0.529 0.596691
PaymentMethodElectronic check 0.498424 0.112925 4.414 1.02e-05 ***
PaymentMethodMailed check 0.057147 0.135206 0.423 0.672537
tenure -0.034847 0.002822 -12.348 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5696.3 on 4929 degrees of freedom
Residual deviance: 4094.4 on 4912 degrees of freedom
AIC: 4130.4
Number of Fisher Scoring iterations: 6
p1 = predict(m1, train.data.set, type = 'response')
prediction from a rank-deficient fit may be misleading
pred2 = prediction(p1,train.data.set$Churn)
perf = performance(pred2,"tpr","fpr")
plot(perf,colorize = T)
t1 = table(train.data.set$Churn, p1 > 0.45)
t1
FALSE TRUE
No 3151 475
Yes 534 770
TN1 = t1[1]
FN1 = t1[2]
FP1 = t1[3]
TP1 = t1[4]
TN1
[1] 3151
FN1
[1] 534
FP1
[1] 475
TP1
[1] 770
Accuary1 = (TP1+TN1)/(TP1+TN1+FP1+FN1)
Precision1 = (TP1)/(TP1+FP1)
Recall1 = (TP1)/(TP1+FN1)
F11 = 2*Precision1*Recall1/(Precision1+Recall1)
Accuary1
[1] 0.7953347
Precision1
[1] 0.6184739
Recall1
[1] 0.5904908
F11
[1] 0.6041585
p1 = predict(m1, test.data.set, type = 'response')
prediction from a rank-deficient fit may be misleading
pred2 = prediction(p1,test.data.set$Churn)
perf = performance(pred2,"tpr","fpr")
plot(perf,colorize = T)
t1 = table(test.data.set$Churn, p1 > 0.45)
t1
FALSE TRUE
No 1350 198
Yes 248 317
TN1 = t1[1]
FN1 = t1[2]
FP1 = t1[3]
TP1 = t1[4]
TN1
[1] 1350
FN1
[1] 248
FP1
[1] 198
TP1
[1] 317
Accuary1 = (TP1+TN1)/(TP1+TN1+FP1+FN1)
Precision1 = (TP1)/(TP1+FP1)
Recall1 = (TP1)/(TP1+FN1)
F11 = 2*Precision1*Recall1/(Precision1+Recall1)
Accuary1
[1] 0.7889257
Precision1
[1] 0.615534
Recall1
[1] 0.5610619
F11
[1] 0.587037